# Load Necessary Packages ------------------------------------------------------

#install.packages("pacman")
library(pacman)

### Note, the following lines install packages, double-check if this desired ###

# Tidyverse
p_load(dplyr, forcats, ggplot2, srvyr, tidyr, naniar)

# Estimation
p_load(fixest, margins, etable, lmtest)

# Misc
p_load(here, readstata13, survey, patchwork, corrplot, stargazer)

# Read in ACCESS full dataset --------------------------------------------------

pool <- read.dta13(here("Data", "CEEW - ACCESS20152018 Appended.dta"),
                   convert.factors = T)

weights <- read.dta13(here("Data", "Weights", "weights.dta"))

pool <- pool %>% 
  mutate(year = as.numeric(year)) %>% 
  left_join(weights, pool, by = c("year","m1_q11_village_code"))

pool <- pool %>% 
  arrange(finalhhid,year) %>% 
  rename(age = m1_q19_age,
         gender = m1_q20_gender,
         state = m1_q8_state_code,
         district = m1_q9_district_code,
         village = m1_q11_village_code,
         hh = finalhhid,
         month_exp = m1_q32_month_expenditure,
         education = m1_q23_edu,
         duration_day = m2_q69_elec_hrs,
         duration_night = m2_q70_elec_night_hrs,
         reliability = m2_q71_elec_out_days,
         quality_fluc = m2_q72_elec_equi_suffer_days,
         quality_low = m2_q73_elec_equi_low_days,
         grid_spend = m2_q55_3_grid_spending,
         mg_spend = m2_q58_2_micro_spending,
         others_steal = m3_q95_elec_steal
         ) %>%
  # Remove (1) observation with missing caste information
  filter(m1_q25_caste != "5") %>%
  mutate(
    caste = case_when(m1_q25_caste == 1 ~ "Scheduled Caste",
                                     m1_q25_caste == 2 ~ "Scheduled Tribe",
                                     m1_q25_caste == 3 ~ "Other Backward Class",
                                     m1_q25_caste == 4 ~ "General",
                                     TRUE ~ NA_character_),
    religion = case_when(m1_q24_religion == 1 ~ "Hindu",
                          m1_q24_religion == 2 ~ "Muslim",
                          m1_q24_religion == 3 ~ "Other",
                          TRUE ~ NA_character_),
    wave = as.integer(year == 2018),
    scst = as.integer(caste == "Scheduled Caste" | caste == "Scheduled Tribe"),
    sc = as.integer(caste == "Scheduled Caste"),
    st = as.integer(caste == "Scheduled Tribe"),
    hhsize = m1_q27_no_adults + m1_q29_no_children,
    lnexp = log(month_exp + 1),
    bplaay = as.integer(m1_q26_ration == "BPL" | m1_q26_ration == "Antyodaya"),
    usegrid = as.integer(m2_q55_grid == "Yes"),
    uselpg = as.integer(m4_q103_lpg == "Yes"),
    convenience = as.integer(m4_q103_14_lpg_cyl_deliver == "Yes"),
    kero_spend = (m2_q61_4_kero_liter_PDS * m2_q61_5_kero_price_PDS) +
      (m2_q61_6_kero_liter_mkt * m2_q61_7_kero_price_mkt)
    ) %>%
  mutate(# set duration night to NA if this exceeds the survey threshold (6)
    duration_night = case_when(
      duration_night > 6 ~ NA_real_,
      TRUE ~ duration_night)
    ) %>% 
  select(state, district, village, hh, hhsize, year, wave, age, gender, caste, 
         scst, sc, st, religion, education, month_exp, bplaay, lnexp, usegrid, 
         duration_day, duration_night, reliability, quality_fluc, quality_low,
         uselpg, convenience, grid_spend, mg_spend, kero_spend, others_steal,
         weights)

# Visualise Missing Data -------------------------------------------------------

pool %>%
  filter(year == 2015) %>% 
  naniar::gg_miss_var(show_pct = T)

pool %>%
  filter(year == 2018) %>% 
  naniar::gg_miss_var(show_pct = T)

# Create explanatory variables for summary tables
pool <- pool %>% 
  mutate(
    gender_male = as.integer(gender == "1"),
    religion_hindu = as.integer(religion == "Hindu"),
    religion_muslim = as.integer(religion == "Muslim"),
    religion_others = as.integer(religion == "Others"),
    education_below10th = as.integer(education == "No Formal Schooling" |
                                       education == "Up to 5th Standard"),
    education_atleast10th = 
      as.integer(
        education == "Up to 10th Standard" |
          education == "12th Standard or Diploma" |
          education == "Graduate and Above")
  )

# Create weighted survey design object for later analysis ----------------------

svydesign <- svydesign(id=~hh, strata=~village, data =pool, 
                       weight =~weights, nest = TRUE)

# Create observation count tables ---------------------------------------------- 

pool_obs <- pool %>% group_by(wave,state) %>% 
  summarise(district = length(unique(district)),
            village = length(unique(village)),
            households = n(),
            scprop = scales::percent(mean(sc)),
            stprop = scales::percent(mean(st))
  ) %>%
  ungroup() %>% 
  arrange(state) %>% 
  mutate(
    wave = case_when(
      wave == 0 ~ 2014,
      wave == 1 ~ 2018),
    state = case_when(state == 9 ~ "Uttar Pradesh",
                      state == 10 ~ "Bihar",
                      state == 19 ~ "West Bengal",
                      state == 20 ~ "Jharkhand",
                      state == 21 ~ "Odisha",
                      state == 23 ~ "Madhya Pradesh")
    ) %>% 
  rename("Wave" = "wave", "State" = "state", "Districts" = "district",
         "Villages" = "village", "Households" = "households", 
         "SC Proportion" = "scprop", "ST Proportion" = "stprop")
    
stargazer(pool_obs, summary = FALSE, float = FALSE, rownames = F,
          out = "./Manuscript/Tables/pool_obs.tex")

# Summary statistics of control variables --------------------------------------

ctrlvars <- c("age","gender_male","religion_hindu","religion_muslim",
              "religion_others", "education_below10th", "education_atleast10th",
              "hhsize", "month_exp", "bplaay")

source(here("R","sumstats_ctrl.R"))

stargazer(svysummary_ctrl_wave1, summary = FALSE, float = FALSE, rownames = F,
          notes = c("Note: Estimates are weighted by sample design weights."),
          notes.align = "r",
          out = "./Manuscript/Tables/sumstatsctrl_pool_weighted_wave1.tex")

stargazer(svysummary_ctrl_wave2, summary = FALSE, float = FALSE, rownames = F,
          notes = c("Note: Estimates are weighted by sample design weights."),
          notes.align = "r",
          out = "./Manuscript/Tables/sumstatsctrl_pool_weighted_wave2.tex")

# Create correlation tables of control variables -------------------------------

poolctrlvars <- pool %>% 
  select(wave, scst, "Age" = age, "BPL/AAY Household" = bplaay,
         "Education >= 10th Grade" = education_atleast10th,
         "Gender == Male" = gender_male, "Household Size" = hhsize,
         "Monthly Expenditure" = month_exp, 
         "Religion == Hindu" = religion_hindu, 
         "SC/ST Household" = scst)

corrpoolctrlvars_wave0 <- poolctrlvars %>% 
  filter(wave == 0) %>% 
  select(-wave) %>% 
  cor()

png(height=600, width=600, pointsize = 15, 
    file= here("Manuscript","Figures","corrplotctrlvars_wave0.png"))

corrplot(corrpoolctrlvars_wave0, type = "lower", order = "alphabet",
         addCoef.col = "black", tl.col = "black",
         method = "color", diag = F)

dev.off()

corrpoolctrlvars_wave1 <- poolctrlvars %>% na.omit() %>% 
  filter(wave == 1) %>% 
  select(-wave) %>% 
  cor()

png(height=600, width=600, pointsize = 15, 
    file= here("Manuscript","Figures","corrplotctrlvars_wave1.png"))

corrplot(corrpoolctrlvars_wave1, type = "lower", order = "alphabet",
         addCoef.col = "black", tl.col = "black",
         method = "color", diag = F)

dev.off()

# Summary statistics of response variables -------------------------------------

acc_respvars <- c("usegrid", "uselpg")
lpg_respvars <- c("convenience")
elec_respvars <- c("duration_day", "duration_night", "reliability", 
                   "quality_low", "quality_fluc")

source(here("R","sumstats_resp.R")) 

stargazer(svysummary_resp_wave1, summary = FALSE, float = FALSE, rownames = F,
          notes = c("Note: Estimates are weighted by sample design weights."),
          notes.align = "r",
          out = "./Manuscript/Tables/sumstatsresp_pool_weighted_wave1.tex")

stargazer(svysummary_resp_wave2, summary = FALSE, float = FALSE, rownames = F,
          notes = c("Note: Estimates are weighted by sample design weights."),
          notes.align = "r",
          out = "./Manuscript/Tables/sumstatsresp_pool_weighted_wave2.tex")

# Calculate weighted difference in means (weighted t-test) ---------------------
svysummary_acc_resp_diffs <- data.frame(matrix(ncol = 6, nrow = 0))
svysummary_lpg_resp_diffs <- data.frame(matrix(ncol = 6, nrow = 0))
svysummary_elec_resp_diffs <- data.frame(matrix(ncol = 6, nrow = 0))

for (i in acc_respvars) {
  for (j in c(0,1)) {
    acc_subset = subset(svydesign, scst == j)
    t <- svyttest(as.formula(paste(i,"~","wave")), design = acc_subset)
    df <- data.frame(attribute = i,
                     scst = j,
                     diff = signif(t$estimate["difference in mean"],3),
                     pvalue = round(t$p.value["wave"],3),
                     ci = round(t$conf.int,2))
    svysummary_acc_resp_diffs <- rbind(svysummary_acc_resp_diffs, df)
    rm(acc_subset,t,df)
  }
}

for (i in lpg_respvars) {
  for (j in c(0,1)) {
    lpg_subset = subset(svydesign, scst == j & uselpg == 1)
    t <- svyttest(as.formula(paste(i,"~","wave")), design = lpg_subset)
    df <- data.frame(attribute = i,
                     scst = j,
                     diff = signif(t$estimate["difference in mean"],3),
                     pvalue = round(t$p.value["wave"],3),
                     ci = round(t$conf.int,2))
    svysummary_lpg_resp_diffs <- rbind(svysummary_lpg_resp_diffs, df)
    rm(lpg_subset,t,df)
  }
}

for (i in elec_respvars) {
  for (j in c(0,1)) {
    elec_subset = subset(svydesign, scst == j & usegrid == 1)
    t <- svyttest(as.formula(paste(i,"~","wave")), design = elec_subset)
    df <- data.frame(attribute = i,
                     scst = j,
                     diff = signif(t$estimate["difference in mean"],3),
                     pvalue = round(t$p.value["wave"],3),
                     ci = round(t$conf.int,2))
    svysummary_elec_resp_diffs <- rbind(svysummary_elec_resp_diffs, df)
    rm(elec_subset, t,df)
  }
}

svysummary_resp_diffs = rbind(svysummary_acc_resp_diffs, 
                              svysummary_lpg_resp_diffs, 
                              svysummary_elec_resp_diffs)

svysummary_resp_wide = merge(svysummary_resp %>% 
                               filter(wave == 0) %>% 
                               select(-c(wave)),
                             svysummary_resp %>% 
                               filter(wave == 1) %>% 
                               select(-c(wave)),
                             by = c("attribute","scst"))

svysummary_resp_diffs_wide <- merge(svysummary_resp_wide, svysummary_resp_diffs, 
                                    by = c("attribute","scst")) %>% 
  mutate(diff = paste(diff, " (", ci.2.5.., ",", ci.97.5.., ")", sep = ""),
         scst = as.factor(scst),
         attribute = as.factor(attribute)) %>% 
  select(attribute, scst, mean.x, mean.y, diff, pvalue)

svysummary_resp_diffs_wide <- svysummary_resp_diffs_wide %>% 
  mutate(attribute = fct_recode(attribute,
                                "Grid Connection (==1)" = "usegrid",
                                "LPG Connection (==1)" = "uselpg",
                                "LPG Home Delivery (==1)" = "convenience",
                                "Daily Electricity Hours" = "duration_day",
                                "Evening Electricity Hours" = "duration_night",
                                "Monthly Outage Days" = "reliability", 
                                "Monthly Voltage Fluctuation Days" = "quality_fluc",
                                "Monthly Low Voltage Days" = "quality_low"),
         scst = fct_recode(scst,
                           "SC/ST" = "1",
                           "All Others" = "0")) %>% 
  arrange(attribute)

colnames(svysummary_resp_diffs_wide) <- c("Attribute", "Caste", 
                                          "Mean 2014", "Mean 2018", 
                                          "Difference (CI)","P-value")

stargazer(svysummary_resp_diffs_wide, summary = FALSE, float = FALSE, rownames = F,
          notes = c("Note: Sampling weights are used to generate the estimates and confidence intervals provided in parenthesis."),
          notes.align = "r",
          out = "./Manuscript/Tables/diffs_pool_weighted.tex")

# Remove summary statistics objects clutter
rm(svysummary_ctrl, svysummary_ctrl_wave1, svysummary_ctrl_wave2,
   svysummary_resp_clean, svysummary_resp_wave1, svysummary_resp_wave2,
   svysummary_resp_wide, svysummary_resp_diffs, svysummary_resp_diffs_wide,
   svysummary_acc_resp, svysummary_acc_resp_diffs, svysummary_lpg_resp, 
   svysummary_lpg_resp_diffs, svysummary_elec_resp, svysummary_elec_resp_diffs, 
   i, j, k, corrpoolctrlvars_wave0, corrpoolctrlvars_wave1,
   poolctrlvars, svysummary_resp, pool_obs)

# Weighted descriptive analysis using pooled cross-sectional dataset -----------

# Visalise differences in energy access for SC/ST relative to All Others
source(here("R", "descr_vis_new.R"))

# Regression Analysis ----------------------------------------------------------
# Note: OLS with Village & Household Fixed Effects (both pool and panel)
source(here("R", "dd_access_vfe.hfe_scst.R"))
source(here("R", "dd_elec_vfe.hfe_scst.R"))

# Disaggregate analysis by SC and ST groups -----------------------------------
source(here("R", "dd_access_vfe.hfe_bplaay.R"))
source(here("R", "dd_elec_vfe.hfe_bplaay.R"))

# State-wise Analysis ----------------------------------------------------------

# Replicates the stata lincom command, using robust clustered standard errors
# for our specific case of one interaction term (wave) and one binary 
# variable (scst) interacted

lincom_custom <- function (fit, interactionterm, var1) {
  
  sumvar1interaction <- fit$coefficients[[interactionterm]] + fit$coefficients[[var1]]

  vcov <- summary(fit)$cov.scaled
  
  sevar1interaction <- sqrt(
    fit[["se"]][interactionterm]^2 +
      fit[["se"]][var1]^2 +
      2 * vcov[interactionterm, var1])
  
  lincom_df <- tibble(.rows = 2,
                      "term" = c(interactionterm,var1),
                      "coef" = c(fit$coefficients[[interactionterm]],
                                 sumvar1interaction),
                      "se" = c(fit[["se"]][[interactionterm]],
                               sevar1interaction))
  
  return(lincom_df)
  
}

# Energy access outcomes

source(here("R", "Heterogeneity Analysis", "dd_access_vfe.hfe_bpl.scst_state_BI.R"))
source(here("R", "Heterogeneity Analysis", "dd_access_vfe.hfe_bpl.scst_state_JK.R"))
source(here("R", "Heterogeneity Analysis", "dd_access_vfe.hfe_bpl.scst_state_MP.R"))
source(here("R", "Heterogeneity Analysis", "dd_access_vfe.hfe_bpl.scst_state_OD.R"))
source(here("R", "Heterogeneity Analysis", "dd_access_vfe.hfe_bpl.scst_state_UP.R"))
source(here("R", "Heterogeneity Analysis", "dd_access_vfe.hfe_bpl.scst_state_WB.R"))

allstates <- c("bi" = state_bi_fits, "jk" = state_jk_fits, 
               "mp" = state_mp_fits, "od" = state_od_fits,
               "up" = state_up_fits, "wb" = state_wb_fits)

coefs_access <- tibble(.rows = 0,
                "state" = c(""),
                "model" = c(""),
                "term" = c(""),
                "coef" = c(""),
                "se" = c(""))

n <- 1

for (i in allstates) {
  df <- lincom_custom(i, "wave", "scst:wave") %>% 
    mutate(state = names(allstates[n]),
           model = as.character(i[["call"]][["fml"]][[2]]))
  coefs_access <- rbind(coefs_access, df)
  n <- n + 1
}

coefs_access %>% 
  mutate(state = case_when(
    grepl(state, pattern = "bi") ~ "BI",
    grepl(state, pattern = "jk") ~ "JK",
    grepl(state, pattern = "mp") ~ "MP",
    grepl(state, pattern = "od") ~ "OD",
    grepl(state, pattern = "up") ~ "UP",
    grepl(state, pattern = "wb") ~ "WB"),
    model = case_when(
      grepl(model, pattern = "lpg") ~ "LPG Connection",
      grepl(model, pattern = "convenience") ~ "LPG Home Delivery",
      grepl(model, pattern = "grid") ~ "Grid Connection"),
    term = case_when(
      grepl(term, pattern = "scst") ~ "SC/ST",
      TRUE ~ "All Others")
  ) %>% 
  ggplot(aes(state, coef, colour = term, shape = term,
             ymin = coef - 1.96 * se,
             ymax = coef + 1.96 * se)) +
  geom_point(position = position_dodge(width = 0.5)) +
  geom_errorbar(position = position_dodge(width = 0.5), width = 0.5) +
  geom_hline(yintercept = 0, colour = "black", linetype = 2) +
  facet_wrap(~model, ncol = 2) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(x = "State",
       y = "Access likelihood change in percentage points",
       colour = "Social Group",
       shape = "Social Group"
  ) +
  coord_cartesian(ylim = c(-0.1,0.6)) +
  theme_bw() +
  theme(strip.background = element_rect(fill = NA, color = "black"),
        legend.position = c(0.75,0.25))

ggsave(here("Manuscript","Figures","improvement_state_access.png"),
       device = "png", width = 6, height = 6, units = "in")

# Electricity supply outcomes

source(here("R", "Heterogeneity Analysis", "dd_elec_vfe.hfe_bpl.scst_state_BI.R"))
source(here("R", "Heterogeneity Analysis", "dd_elec_vfe.hfe_bpl.scst_state_JK.R"))
source(here("R", "Heterogeneity Analysis", "dd_elec_vfe.hfe_bpl.scst_state_MP.R"))
source(here("R", "Heterogeneity Analysis", "dd_elec_vfe.hfe_bpl.scst_state_OD.R"))
source(here("R", "Heterogeneity Analysis", "dd_elec_vfe.hfe_bpl.scst_state_UP.R"))
source(here("R", "Heterogeneity Analysis", "dd_elec_vfe.hfe_bpl.scst_state_WB.R"))

allstates <- c("bi" = state_bi_fits, "jk" = state_jk_fits, 
               "mp" = state_mp_fits, "od" = state_od_fits,
               "up" = state_up_fits, "wb" = state_wb_fits)

coefs_elec <- tibble(.rows = 0,
                "state" = c(""),
                "model" = c(""),
                "term" = c(""),
                "coef" = c(""),
                "se" = c(""))

n <- 1

for (i in allstates) {
  df <- lincom_custom(i, "wave", "scst:wave") %>% 
    mutate(state = names(allstates[n]),
           model = as.character(i[["call"]][["fml"]][[2]]))
  coefs_elec <- rbind(coefs_elec, df)
  n <- n + 1
}

coefs_elec %>% 
  mutate(state = case_when(
    grepl(state, pattern = "bi") ~ "BI",
    grepl(state, pattern = "jk") ~ "JK",
    grepl(state, pattern = "mp") ~ "MP",
    grepl(state, pattern = "od") ~ "OD",
    grepl(state, pattern = "up") ~ "UP",
    grepl(state, pattern = "wb") ~ "WB"),
    model = case_when(
      grepl(model, pattern = "day") ~ "Daily Supply Hours",
      grepl(model, pattern = "night") ~ "Evening Supply Hours",
      grepl(model, pattern = "reliability") ~ "Outage Days",
      grepl(model, pattern = "fluc") ~ "Fluctuation Voltage Days",
      grepl(model, pattern = "low") ~ "Low Voltage Days"),
    term = case_when(
      grepl(term, pattern = "scst") ~ "SC/ST",
      TRUE ~ "All Others")
  ) %>% 
  ggplot(aes(state, coef, colour = term, shape = term,
             ymin = coef - 1.96 * se,
             ymax = coef + 1.96 * se)) +
  geom_point(position = position_dodge(width = 0.5)) +
  geom_errorbar(position = position_dodge(width = 0.5), width = 0.5) +
  geom_hline(yintercept = 0, colour = "black", linetype = 2) +
  facet_wrap(~model) +
  labs(x = "State",
       y = "Absolute change in the outcome variable",
       colour = "Social Group",
       shape = "Social Group"
  ) +
  theme_bw() +
  theme(strip.background = element_rect(fill = NA, color = "black"), 
        legend.position = c(0.85,0.25))

ggsave(here("Manuscript","Figures","improvement_state_elec.png"),
       device = "png", width = 8, height = 10*0.618, units = "in")

# Robustness Analysis ----------------------------------------------------------
# Note: Probit regressions using pooled cross-section for binary response vars
source(here("R", "dd_access_vfe_probit_scst.R"))
